The goal of this notebook is to explore the companies' bankruptcy data in several ways: analyse the general behavior of the data and all forms of correlation between the variables, and finally determine which variables are useless and should be dropped.
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Enable inline Plotly rendering in the notebook and silence library warnings.
init_notebook_mode()
warnings.filterwarnings("ignore")
# Collect the five yearly bankruptcy CSVs (1year.csv ... 5year.csv).
# NOTE(review): glob order is OS/filesystem-dependent — presumably alphabetical
# here (the output below is sorted), but confirm before relying on it.
dirpath = Path('../data/extension/csv')
lstfile = list(dirpath.glob('*year.csv'))
lstfile
[WindowsPath('../data/extension/csv/1year.csv'),
WindowsPath('../data/extension/csv/2year.csv'),
WindowsPath('../data/extension/csv/3year.csv'),
WindowsPath('../data/extension/csv/4year.csv'),
WindowsPath('../data/extension/csv/5year.csv')]
# Load every yearly CSV into a dict keyed by the file stem (e.g. '1year'),
# and record each table's (rows, columns) for a quick size comparison.
dataset = {}
for path in lstfile:
    dataset[path.stem] = pd.read_csv(path)
shapes = {name: frame.shape for name, frame in dataset.items()}
shapes
{'1year': (7026, 65),
'2year': (10172, 65),
'3year': (10502, 65),
'4year': (9791, 65),
'5year': (5909, 65)}
# Bar chart of company counts per dataset; datasets under 10k rows in dark red.
plt.figure(figsize=(12,6))
plt.title('Total number of companies in each dataset', fontweight='bold')
counts = [shape[0] for shape in shapes.values()]
bar_colors = ['darkred' if count < 10000 else 'darkblue' for count in counts]
ax = plt.bar(shapes.keys(), counts, color=bar_colors)
# Class balance of the X65 target per dataset: non-bankrupt vs. bankrupt counts.
per_year_counts = [frame.X65.value_counts() for frame in dataset.values()]
bankruptcy = pd.concat(per_year_counts, axis=1, ignore_index=True)
bankruptcy.columns = dataset.keys()
bankruptcy.index = ['no bankrupt', 'bankrupt']
bankruptcy
| 1year | 2year | 3year | 4year | 5year | |
|---|---|---|---|---|---|
| no bankrupt | 6755 | 9772 | 10007 | 9276 | 5499 |
| bankrupt | 271 | 400 | 495 | 515 | 410 |
# Log-scaled grouped bars of the class counts, annotated with the raw values.
ax = bankruptcy.T.plot.bar(log=True, color=['lime','crimson'], figsize=(12,6), title='Bankrupt companies')
for bar in ax.patches:
    x_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar.get_height(), (x_center, 10), ha='center', va='center')
In these datasets, missing values are represented by a "?"
# Count the '?' placeholders that mark missing values, per dataset,
# both as an absolute total and as a fraction of all cells.
missing_val = {k: (v == '?').sum().sum() for k, v in dataset.items()}
# Reuse the totals instead of re-scanning every frame; DataFrame.size is the
# idiomatic rows*columns (clearer than np.multiply(*v.shape)).
freq = {k: miss / dataset[k].size for k, miss in missing_val.items()}
f, ax = plt.subplots(1,2, figsize=(15,6))
f.suptitle('Missing values')
ax[0].barh(list(missing_val.keys()), missing_val.values(), color='purple')
ax[0].set_title('Total number')
ax[1].barh(list(freq.keys()), freq.values(), color='orange')
ax[1].set_title('Frequency')
Text(0.5, 1.0, 'Frequency')
# Focus on one yearly dataset (n in 1..5) and chart missing values per variable.
n = 3
pd.options.display.max_rows = 100
# Boolean mask of the '?' placeholders in the selected dataset.
missing = (dataset[f'{n}year'] == '?')
# Colour-code columns by severity: red > 100 missing, orange > 10, green otherwise.
color = ['r' if x > 100 else ('orange' if x > 10 else 'limegreen') for x in missing.sum(axis=0)]
missing.sum(axis=0).plot(kind='bar', figsize=(15,5), log=True, color=color)
# Proper ordinal suffix ("1st", "2nd", "3rd", "5th") — the former f'{n}th'
# produced titles like "3th dataset".
suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n, 'th')
plt.title(f'Number of missing values per variable in the {n}{suffix} dataset')
Text(0.5, 1.0, 'Number of missing values per variable in the 3th dataset')
# Group the variables that share the same missing-value count into a single
# table row, then render with a blue gradient (darker = more missing values).
per_column = pd.DataFrame({'missing values': missing.sum(axis=0).values},
                          index=missing.columns.values)
grouped = np.array([(count, ', '.join(cols.index.values))
                    for count, cols in per_column.groupby('missing values')])
missing_col = pd.DataFrame({'Number of missing values': grouped[:, 0].astype(int)},
                           index=grouped[:, 1])
missing_col.sort_values('Number of missing values', ascending=False)\
    .style.background_gradient(cmap='Blues', vmax=500, vmin=20, low=0.1, high=0.9)
| Number of missing values | |
|---|---|
| X37 | 4736 |
| X21 | 807 |
| X27 | 715 |
| X60 | 592 |
| X45 | 591 |
| X28, X53, X54, X64 | 228 |
| X24 | 227 |
| X41 | 202 |
| X32 | 101 |
| X47, X52 | 86 |
| X13, X19, X20, X23, X30, X31, X39, X42, X43, X44, X49, X56, X62 | 43 |
| X58 | 29 |
| X5 | 25 |
| X4, X12, X33, X40, X46, X63 | 18 |
| X61 | 17 |
| X8, X16, X17, X26, X34, X50 | 14 |
| X15 | 8 |
| X9 | 3 |
| X1, X2, X3, X6, X7, X10, X11, X14, X18, X22, X25, X29, X35, X36, X38, X48, X51, X55, X57, X59, X65 | 0 |
# The 50 worst rows by number of missing cells, to spot companies worth dropping.
row_missing = missing.sum(axis=1).sort_values(ascending=False)
row_missing[:50].plot(kind='bar', figsize=(15,5))
plt.title('Max number of missing values per row')
Text(0.5, 1.0, 'Max number of missing values per row')
# Flag variables with 50 or more missing values for removal.
col_missing = missing.sum(axis=0)
useless_col = col_missing[col_missing >= 50]
useless_col.name = 'Useless variables'
useless_col
X21 807 X24 227 X27 715 X28 228 X32 101 X37 4736 X41 202 X45 591 X47 86 X52 86 X53 228 X54 228 X60 592 X64 228 Name: Useless variables, dtype: int64
# Drop the flagged variables from the mask, then flag companies that are still
# missing 5 or more values.
missing.drop(columns=useless_col.index, inplace=True)
row_counts = missing.sum(axis=1)
useless_row = row_counts[row_counts >= 5]
useless_row.name = 'Useless companies'
len(useless_row)
58
# Drop the flagged companies, then compare raw vs. cleaned missing counts.
missing.drop(index=useless_row.index, inplace=True)
raw_mask = (dataset[f'{n}year'] == '?')
print("old dataset : ", raw_mask.sum().sum(), "omitted\nshape :", raw_mask.shape)
print("dataset cleaned : ", missing.sum().sum(), "omitted\nshape :", missing.shape)
old dataset : 9888 omitted shape : (10502, 65) dataset cleaned : 36 omitted shape : (10444, 51)
Dataset of the nth year with far fewer missing values
# Rebuild the nth-year dataset without the useless variables and companies.
data = dataset[f'{n}year'].copy()
data = data.drop(columns=useless_col.index)
data = data.drop(index=useless_row.index)
Replace the remaining missing values with the corresponding column mean
# Vectorized imputation replacing the former per-column Python loop:
# turn the '?' placeholders into NaN, cast to float, and fill each hole with
# its column mean — pandas' mean() skips NaN, so the mean is computed over
# the known values exactly as the loop's astype(float).mean() did.
data = data.replace('?', np.nan).astype(float)
data = data.fillna(data.mean())
# The target is categorical (0 = safe, 1 = bankrupt, per the labelling below).
data.X65 = data.X65.astype('category')
(data == '?').sum().sum()
0
# Box plots of the first 20 predictors split by outcome, one subplot each.
# (The former plt.figure(figsize=(12,6)) was dead code: the plot is pure
# Plotly, and the matplotlib call only emitted an empty "0 Axes" figure.)
fig = make_subplots(2, 10, horizontal_spacing=0.02)
idx = np.array([(x, y) for x in range(1, 3) for y in range(1, 11)])
predictors = ['safe' if x == 0 else 'bankrupt' for x in data['X65'].values]
for i, x in enumerate(data.drop('X65', axis=1).columns[0:20]):
    row, col = idx[i]
    # add_trace replaces the deprecated append_trace.
    fig.add_trace(go.Box(y=data[x], x=predictors, name=x), row=row, col=col)
    # Clip the y-range to the 30th smallest/largest values to hide extreme outliers.
    y_sorted = data[x].sort_values()
    y_min, y_max = y_sorted.iloc[30], y_sorted.iloc[-30]
    fig.update_yaxes(range=(y_min, y_max), row=row, col=col)
    fig.update_xaxes(title=x, row=row, col=col)
fig.update_layout(height=800, width=2000)
<Figure size 864x432 with 0 Axes>
# Lower-triangle correlation heatmap; the redundant upper triangle is masked.
cor_mat = data.corr()
plt.figure(figsize=(60,15))
plt.title('Correlation plot', fontweight='bold')
upper_triangle = np.triu(np.ones_like(cor_mat, dtype=bool))
sns.heatmap(cor_mat, square=True, annot=True, cmap='seismic', mask=upper_triangle, fmt='.1f')
<AxesSubplot:title={'center':'Correlation plot'}>
From this correlation plot and the analysis of the target variable, we can see that several variables are highly correlated, or can be discarded due to their low variation, so we could drop some of them without loss of information in order to improve the performance of our future ML models.
Let's perform a PCA to analyse which variables are the most informative
We start by standardizing the data matrix and then apply a PCA
# Standardize the predictors (zero mean, unit variance), then project with PCA.
features = data.drop('X65', axis=1)
ss_data = StandardScaler().fit_transform(features)
pca = PCA()
pc_data = pca.fit_transform(ss_data)
Let's see how many components will really be needed for the future model: the fewer there are, the easier and faster our model will be to train
# Scree plot: cumulative explained variance per principal component, with
# markers at the 70% and 99% thresholds, plus an inset of the raw ratios.
f = plt.figure(figsize=(25,12))
pcs = [f'C{i}' for i in range(ss_data.shape[1])]
cumul = pca.explained_variance_ratio_.cumsum()
cumul70 = np.where(cumul > 0.7)[0][0]   # first component crossing 70%
cumul99 = np.where(cumul > 0.99)[0][0]  # first component crossing 99%
# Keyword x=/y= required: seaborn >= 0.12 rejects positional data arguments.
sns.barplot(x=pcs, y=cumul)
sns.lineplot(x=pcs, y=cumul, marker='o', ms=10)
plt.axvline(cumul70, 0, 1, c='orange', lw=6, ls=':')
plt.text(cumul70 + 0.5, 0.1, 'Upper to 70%', size=30)
plt.axvline(cumul99, 0, 1, c='lime', lw=6, ls=':')
plt.text(cumul99 + 0.5, 0.1, 'Upper to 99%', size=30)
plt.title('Cumulative explained variance in percentage', dict(fontsize=20, fontweight='bold'))
plt.xlabel('Principal components')
ax = f.add_axes([0.6, 0.4, 0.2, 0.3], title='Explained variance ratio - eigenvalues')
ax.get_xaxis().set_visible(False)
sns.barplot(x=pcs, y=pca.explained_variance_ratio_, log=True, ax=ax)
<matplotlib.axes._axes.Axes at 0x236b770ba90>
Here, we won't show the graph of the individuals because of the size of the dataset, so we will only show the graph of variables
# Loadings plot: one arrow per original variable, positioned at its weight
# on PC1 (x) and PC2 (y).
#
# BUG FIX: pca.components_ has shape (n_components, n_features), so the old
# slice [:, 0:2] picked each component's weight on the first two FEATURES,
# not each feature's coordinates on PC1/PC2 (it only ran without error
# because the matrix happens to be square). Take the first two ROWS and
# transpose instead. Labels now use the real column names, since some
# X-variables were dropped and f'X{i}' no longer matched the survivors.
feature_names = data.drop('X65', axis=1).columns
loadings = pca.components_[0:2, :].T  # (n_features, 2): PC1, PC2 per variable
arrows = []
for i in range(len(loadings)):
    arrows.append(
        go.layout.Annotation(dict(
            x=loadings[i, 0], y=loadings[i, 1], ax=0, ay=0,
            xref="x", yref="y", axref="x", ayref='y',
            showarrow=True, arrowhead=3, arrowwidth=1.5, arrowcolor='red'))
    )
    arrows.append(
        go.layout.Annotation(dict(
            x=loadings[i, 0], y=loadings[i, 1],
            showarrow=False, text=feature_names[i]))
    )
f = go.Figure(go.Scatter())
f.update_layout(title='Loadings plot', annotations=arrows)
This type of graph is quite difficult to interpret and can be confusing. However, we can see immediately that almost all variables are poorly represented, because their arrows do not reach the boundary of the unit circle